Used libraries:

library(mosaic)
## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Attache Paket: 'mosaic'
## Die folgenden Objekte sind maskiert von 'package:dplyr':
## 
##     count, do, tally
## Das folgende Objekt ist maskiert 'package:Matrix':
## 
##     mean
## Das folgende Objekt ist maskiert 'package:ggplot2':
## 
##     stat
## Die folgenden Objekte sind maskiert von 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## Die folgenden Objekte sind maskiert von 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
library(plotly)
## 
## Attache Paket: 'plotly'
## Das folgende Objekt ist maskiert 'package:mosaic':
## 
##     do
## Das folgende Objekt ist maskiert 'package:ggplot2':
## 
##     last_plot
## Das folgende Objekt ist maskiert 'package:stats':
## 
##     filter
## Das folgende Objekt ist maskiert 'package:graphics':
## 
##     layout
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(dplyr)
library(rpart)
library(caret)
## 
## Attache Paket: 'caret'
## Das folgende Objekt ist maskiert 'package:mosaic':
## 
##     dotPlot
library(psych)
## 
## Attache Paket: 'psych'
## Die folgenden Objekte sind maskiert von 'package:mosaic':
## 
##     logit, rescale
## Die folgenden Objekte sind maskiert von 'package:ggplot2':
## 
##     %+%, alpha
library(ggplot2)
library(ggcorrplot)
library(rela)

1. Read all samples and combine them

# Read the four raw motion samples, one CSV per activity class.
# NOTE(review): the original script started with rm(list = ls()); wiping the
# global environment from inside a script is an anti-pattern and was removed.
# read.csv() already returns a data.frame, so no extra conversion is needed;
# both variable names are kept for compatibility with the rest of the script.
read_idle <- read.csv("01_Idle.csv")
idle_data <- read_idle

read_run <- read.csv("02_Running.csv")
run_data <- read_run

read_dab <- read.csv("03_Dab.csv")
dab_data <- read_dab

read_siu <- read.csv("04_Siu.csv")
siu_data <- read_siu

Rename ID correctly:

# The first column of every sample holds the row identifier;
# give it the same name in all four data frames.
colnames(idle_data)[1] <- "ID"
colnames(run_data)[1]  <- "ID"
colnames(dab_data)[1]  <- "ID"
colnames(siu_data)[1]  <- "ID"

Combined data

Overall in total there are 8985 rows

So in the dab data, Orientation.X and Orientation.Z have the wrong data type: character instead of numeric

Basically we can’t scale before converting to numeric

# Stack the four activity samples into one combined data set. idle_run is
# kept as its own object because its NA counts are inspected below.
idle_run <- rbind(idle_data, run_data)
motion_data <- rbind(idle_run, dab_data, siu_data)

# Orientation.X arrives as character; coercing to numeric turns any
# non-numeric entry into NA, which colSums() then counts per column.
idle_run$Orientation.X <- as.numeric(idle_run$Orientation.X)
colSums(is.na(idle_run))
##                     ID                 Author               Category 
##                      0                      0                      0 
##                 Sample Acceleration.Timestamp         Acceleration.X 
##                      0                      0                      0 
##         Acceleration.Y         Acceleration.Z      AngularVelocity.X 
##                      0                      0                      0 
##      AngularVelocity.Y      AngularVelocity.Z        MagneticField.X 
##                      0                      0                   1499 
##        MagneticField.Y        MagneticField.Z          Orientation.X 
##                   1499                   1499                      0 
##          Orientation.Y          Orientation.Z 
##                      0                      0

2. Do some Exploratory Data Analysis (EDA) on whole data:

motion_data_all <- data.frame(motion_data)

# Drop all three magnetic-field columns in one vectorized step (they each
# contain 1499 NAs); the original removed them one column at a time.
magnetic_cols <- c("MagneticField.X", "MagneticField.Y", "MagneticField.Z")
motion_data_all <- motion_data_all[, !names(motion_data_all) %in% magnetic_cols]

# Convert columns to their correct types. The character -> numeric coercion
# introduces NA for non-numeric entries; those rows are removed later.
motion_data_all$Category <- as.factor(motion_data_all$Category)
motion_data_all$Acceleration.X <- as.numeric(motion_data_all$Acceleration.X)
motion_data_all$Orientation.X <- as.numeric(motion_data_all$Orientation.X)
motion_data_all$Orientation.Z <- as.numeric(motion_data_all$Orientation.Z)

More NAs found after the conversion

# Count NAs per column; the numeric coercion above introduced a few.
colSums(is.na(motion_data_all))
##                     ID                 Author               Category 
##                      0                      0                      0 
##                 Sample Acceleration.Timestamp         Acceleration.X 
##                      0                      0                      0 
##         Acceleration.Y         Acceleration.Z      AngularVelocity.X 
##                      0                      0                      1 
##      AngularVelocity.Y      AngularVelocity.Z          Orientation.X 
##                      1                      1                      0 
##          Orientation.Y          Orientation.Z 
##                      0                      0

Remove the NA’s

About 8584 rows left

# Drop every row containing at least one NA, then verify none remain.
motion_data_all <- na.omit(motion_data_all)
colSums(is.na(motion_data_all))
##                     ID                 Author               Category 
##                      0                      0                      0 
##                 Sample Acceleration.Timestamp         Acceleration.X 
##                      0                      0                      0 
##         Acceleration.Y         Acceleration.Z      AngularVelocity.X 
##                      0                      0                      0 
##      AngularVelocity.Y      AngularVelocity.Z          Orientation.X 
##                      0                      0                      0 
##          Orientation.Y          Orientation.Z 
##                      0                      0

Scale the data:

# Standardize (z-score) the quantitative sensor columns and keep the
# Category factor alongside them. Columns are selected by name instead of
# by position so a change in column order cannot silently break this.
quant_var <- select(motion_data_all, Acceleration.X:Orientation.Z)
cat_var <- select(motion_data_all, Category)

quant_var <- scale(quant_var)
motion_data_scale <- cbind(cat_var, quant_var)
motion_data_scale
motion_data_box <- select(motion_data_scale,
                          c("Acceleration.X", "Acceleration.Y", "Acceleration.Z",
                            "AngularVelocity.X", "AngularVelocity.Y", "AngularVelocity.Z"))
# boxplot() is base graphics, so ggplot2 scales cannot be added to it
# (the original `boxplot(...) + scale_x_discrete(...)` just returned NULL).
# Rotate the axis labels with las = 2 instead.
boxplot(motion_data_box, las = 2)

Train with data from Ahmed, Tobias, Saghar and Ronaldo

#motion_data_part <- subset(motion_data_all, Author == "Ahmed" | Author == "Tobias" | Author == "Saghar" | Author == "Ronaldo") #+ subset(motion_data_all, Author == "Tobias")
#motion_data_unknown <- subset(motion_data, Author == "Regan" | Author == "Darian") # 33 %

# Train on Ahmed, Tobias, Ronaldo and Regan; hold out Saghar as a completely
# unseen author for the later per-sample evaluation. Using %in% instead of a
# chain of `Author == ... | Author == ...` comparisons.
motion_data_part <- subset(motion_data_all,
                           Author %in% c("Ahmed", "Tobias", "Ronaldo", "Regan"))
motion_data_test <- subset(motion_data_all, Author == "Saghar")
# Sanity check: the training subset should contain no missing values.
colSums(is.na(motion_data_part))
##                     ID                 Author               Category 
##                      0                      0                      0 
##                 Sample Acceleration.Timestamp         Acceleration.X 
##                      0                      0                      0 
##         Acceleration.Y         Acceleration.Z      AngularVelocity.X 
##                      0                      0                      0 
##      AngularVelocity.Y      AngularVelocity.Z          Orientation.X 
##                      0                      0                      0 
##          Orientation.Y          Orientation.Z 
##                      0                      0
# Keep an untouched copy (with Author/Sample columns) for descriptive statistics
motion_data_all_stat <- data.frame(motion_data_all)
# Remove identifier/bookkeeping columns that carry no signal for modelling
motion_data_all <- motion_data_all[,!names(motion_data_all) %in% c("ID", "Acceleration.Timestamp", "Author", "Sample")]

Write merged cleaned data to file:

# Persist the merged, cleaned data set for reuse outside this script.
write.csv(motion_data_all, "All Samples Clean.csv", row.names = FALSE)

Category data distribution

Stacked bar chart:

It seems that Darian and Ahmed have more motion data than the others

# Count observations per author and activity for a stacked bar chart.
# .groups = "drop" returns an ungrouped result and silences the
# "`summarise()` has grouped output" message the original emitted.
cat_count <- motion_data_all_stat %>%
  group_by(Author, Category) %>%
  summarize(count = n(), .groups = "drop")

stack_bar <- ggplot(cat_count, aes(x = Author, y = count, fill = Category)) +
  geom_bar(stat = "identity")

ggplotly(stack_bar)

Correlation plot for numerical values:

Threshold: 0.2

Old one: Remaining features: Acceleration.X, Acceleration.Z, Orientation.X, Orientation.Y, Orientation.Z

New one: Remaining features: Acceleration.X, Acceleration.Y, Acceleration.Z, AngularVelocity.X, AngularVelocity.Y, AngularVelocity.Z

We remove the orientation, since everyone had a different phone position

# Correlation needs a purely numeric frame, so drop the Category factor.
# (Orientation columns are kept here but excluded from modelling later,
# because every author held the phone in a different position.)
motion_data_all_numeric <- data.frame(motion_data_all)
motion_data_all_numeric <-
  motion_data_all_numeric[, names(motion_data_all_numeric) != "Category"]

# Pairwise Pearson correlations between all sensor channels.
cor_matrix <- cor(motion_data_all_numeric)

# Lower-triangle correlogram, hierarchically clustered, coefficients printed.
ggcorrplot(cor_matrix, hc.order = TRUE, type = "lower",
           lab = TRUE, lab_size = 3, method = "circle")

We use only relevant columns for the model training

# Restrict training subset and the held-out author to the modelling features
# (orientation dropped: phone position differed per person).
remove_col <- c("ID", "Acceleration.Timestamp", "Author", "Sample",
                "Orientation.X", "Orientation.Y", "Orientation.Z")

motion_data_part <- motion_data_part[, !names(motion_data_part) %in% remove_col]
plot_data <- data.frame(motion_data_test)
plot_data <- plot_data[, !names(plot_data) %in% remove_col]

motion_data_part_numeric <- data.frame(plot_data)
motion_data_part_numeric <-
  motion_data_part_numeric[, !names(motion_data_part_numeric) %in% c("Category")]

# Map the colour aesthetic from the same data frame via `columns =` instead
# of reaching into plot_data with `$` inside aes() (mixing data sources in a
# mapping is fragile if row order changes). Also fixes the title typo.
ggpairs(data = plot_data,
        columns = which(names(plot_data) != "Category"),
        aes(color = Category),
        title = "Motion pair plot with quantitative variables",
        upper = list(
          continuous = wrap("cor", size = 0.75)
        ))

remove_col <- c("ID", "Author", "Sample",
                "Orientation.X", "Orientation.Y", "Orientation.Z")

# Tobias' "Running" sample, taken from the raw combined data so the
# timestamp column is still available for the x axis.
# (The original misleadingly called this object idle_activity.)
running_activity <- subset(motion_data, Category == "Running" & Author == "Tobias")
running_activity <- running_activity[, !names(running_activity) %in% remove_col]

# The group_by(Category) the original piped into ggplot() was a no-op for
# this plot and was removed.
running_plot <- ggplot(running_activity, aes(x = Acceleration.Timestamp)) +
  labs(x = "Timestamp", y = "Acceleration") +
  geom_line(aes(y = Acceleration.X), color = "dark green", alpha = 0.8) +
  geom_line(aes(y = Acceleration.Y), color = "light blue", alpha = 0.8) +
  geom_line(aes(y = Acceleration.Z), color = "dark orange", alpha = 0.8)
ggplotly(running_plot)

3. Train on whole data with selected features:

Train split: 80 %, Test split: 20 %

Since the features that we selected correlate good and are relevant, we skip the angular velocity

# Reproducible, stratified 80/20 split on the activity label.
set.seed(10)

# All remaining columns are used as features (selection from the
# correlation analysis already happened above).
feature_selection <- motion_data_part

train_index_all <- createDataPartition(feature_selection$Category,
                                       p = 0.80, list = FALSE)
train_data_all <- feature_selection[train_index_all, ]
test_data_all <- feature_selection[-train_index_all, ]

Accuracy on train data with rf: 81.56 % without orientation

# Random forest with 4-fold cross-validation on the training split.
set.seed(6)
control_par <- trainControl(method = "cv", number = 4)
model_rf_all <- train(Category ~ .,
                      data = train_data_all,
                      method = "rf",
                      trControl = control_par)

model_rf_all
## Random Forest 
## 
## 4355 samples
##    6 predictor
##    4 classes: 'Dab', 'Idle', 'Running', 'Siu' 
## 
## No pre-processing
## Resampling: Cross-Validated (4 fold) 
## Summary of sample sizes: 3267, 3267, 3265, 3266 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.8378841  0.7835066
##   4     0.8300790  0.7730415
##   6     0.8234207  0.7641610
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.

Random forest with cross validation 4 fold

# Cross-validated (4-fold) confusion matrix of the random forest,
# averaged over the training resamples.
cm_train_data <- confusionMatrix(model_rf_all)
cm_train_data
## Cross-Validated (4 fold) Confusion Matrix 
## 
## (entries are percentual average cell counts across resamples)
##  
##           Reference
## Prediction  Dab Idle Running  Siu
##    Dab     18.0  0.8     1.0  2.0
##    Idle     1.0 23.6     1.1  0.4
##    Running  1.9  1.2    22.3  2.3
##    Siu      1.9  0.7     1.8 19.9
##                             
##  Accuracy (average) : 0.8379

Accuracy on testing data with rf and cv: 83.37 % without orientation

set.seed(6)

# Predict the held-out 20 % split with the tuned random forest.
rf_all_pred_test <- predict(model_rf_all, test_data_all)

# Overall accuracy in percent.
accuracy <- mean(rf_all_pred_test == test_data_all$Category) * 100
accuracy
## [1] 83.88582
cm_test_data <- confusionMatrix(rf_all_pred_test, test_data_all$Category)
cm_test_data
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Dab Idle Running Siu
##    Dab     200   11      13  19
##    Idle     11  254      12   8
##    Running  14   16     245  27
##    Siu      23    6      15 212
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8389          
##                  95% CI : (0.8156, 0.8602)
##     No Information Rate : 0.2643          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7848          
##                                           
##  Mcnemar's Test P-Value : 0.5823          
## 
## Statistics by Class:
## 
##                      Class: Dab Class: Idle Class: Running Class: Siu
## Sensitivity              0.8065      0.8850         0.8596     0.7970
## Specificity              0.9487      0.9612         0.9288     0.9463
## Pos Pred Value           0.8230      0.8912         0.8113     0.8281
## Neg Pred Value           0.9431      0.9588         0.9490     0.9349
## Prevalence               0.2284      0.2643         0.2624     0.2449
## Detection Rate           0.1842      0.2339         0.2256     0.1952
## Detection Prevalence     0.2238      0.2624         0.2781     0.2357
## Balanced Accuracy        0.8776      0.9231         0.8942     0.8717
plt <- as.data.frame(cm_test_data$table)
# Reverse the prediction levels so the matrix diagonal runs top-left to
# bottom-right in the tile plot.
plt$Prediction <- factor(plt$Prediction, levels = rev(levels(plt$Prediction)))

# Axis tick labels come straight from the factor levels; the original
# hard-coded them, which silently mislabels the plot if the class set changes.
rf_conf_mat <- ggplot(plt, aes(Prediction, Reference, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq)) +
  scale_fill_gradient(low = "white", high = "#009194") +
  labs(x = "Prediction", y = "Reference")

ggplotly(rf_conf_mat)

6. Now test the best model from dab on unknown data and compare accuracy

# Reduce the unseen-author test set to the modelling features; keep Sample
# (recoded to consecutive integers 1..40) so we can evaluate per recording.
remove_col <- c("ID",  "Acceleration.Timestamp", "Author", "Orientation.X", "Orientation.Y", "Orientation.Z")
motion_data_test <- motion_data_test[,!names(motion_data_test) %in% remove_col]
motion_data_test$Sample <- as.numeric(as.factor(motion_data_test$Sample))

# All four activity classes should still be present.
unique(motion_data_test$Category)
## [1] Idle    Running Dab     Siu    
## Levels: Dab Idle Running Siu

Idle: 1 - 10, Dab: 11 - 20, Run: 21 - 30, Siu: 31 - 40

# mosaic::inspect() gives a compact per-variable summary of the test set.
inspect(motion_data_test)
## 
## categorical variables:  
##       name  class levels    n missing
## 1 Category factor      4 1420       0
##                                    distribution
## 1 Running (47.6%), Dab (22.1%) ...             
## 
## quantitative variables:  
##                name   class       min         Q1    median         Q3      max
## 1            Sample numeric   1.00000 13.0000000 22.000000 27.0000000 40.00000
## 2    Acceleration.X numeric -19.24533  5.6348475  8.989855 10.3458525 74.95678
## 3    Acceleration.Y numeric -62.43217 -2.5990100 -1.087490 -0.0446425 10.99254
## 4    Acceleration.Z numeric -27.55201 -1.3236225  1.985545  4.8890625 40.44529
## 5 AngularVelocity.X numeric  -7.90234 -0.5865550 -0.007295  0.5926225 12.85294
## 6 AngularVelocity.Y numeric  -7.73286 -0.2825200  0.029700  0.3547175 10.01106
## 7 AngularVelocity.Z numeric -12.65705 -0.2874675 -0.006055  0.2679275  7.92185
##          mean       sd    n missing
## 1 20.17323944 9.586790 1420       0
## 2  8.94612076 8.124970 1420       0
## 3 -2.02695663 5.929216 1420       0
## 4  1.79015740 5.362013 1420       0
## 5  0.03390892 1.698911 1420       0
## 6  0.05902677 1.120474 1420       0
## 7 -0.03541651 1.196526 1420       0

Dab is not recognized at all: 10/10 are misclassified (note: from an earlier run; the output below shows Dab is now mostly recognized)

Idle: 10 / 10 Samples with at least 70 % correct

Running: 10 / 10 Samples with at least 60 % correct

Siu: 9 / 10 Samples with at least 50 % correct

In total we have an avg accuracy of 60 %

# Evaluate the random forest on each individual recording of the unseen
# author. Fixes over the original: the no-op column rename is gone, the
# majority predicted class is taken directly from table() instead of a
# confusionMatrix of the predictions against themselves, and results are
# collected in a preallocated list instead of append() (which flattened
# each data frame into its columns).
list_motion_data_unknown <- vector("list", length(unique(motion_data_test$Sample)))

total_accuracy <- 0
average_accuracy <- 0
for (i in seq_along(list_motion_data_unknown)) {
  # One recorded sample (Sample ids were recoded to 1..n above).
  motion_data_unknown <- subset(motion_data_test, Sample == i)
  ref <- motion_data_unknown$Category
  motion_data_unknown <- motion_data_unknown[, names(motion_data_unknown) != "Sample"]

  # Copy with the labels blanked out so the model cannot see the answer.
  motion_data_no_labels <- data.frame(motion_data_unknown)
  motion_data_no_labels$Category <- ""

  set.seed(6)
  # Generate predictions for this sample.
  rf_dab_pred_new <- predict(object = model_rf_all, newdata = motion_data_no_labels)

  # Per-sample accuracy against the true labels.
  accuracy <- mean(rf_dab_pred_new == motion_data_unknown$Category) * 100
  total_accuracy <- total_accuracy + accuracy

  motion_data_no_labels$Category <- rf_dab_pred_new

  # Majority-vote predicted class for this recording.
  majority_pred <- names(which.max(table(rf_dab_pred_new)))

  print(paste("Reference: ", unique(ref),
              "Prediction: ", majority_pred,
              "Accuracy: ", accuracy, sep = " "))

  list_motion_data_unknown[[i]] <- motion_data_no_labels
}
## [1] "Reference:  Idle Prediction:  Dab Accuracy:  48.4848484848485"
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  76.6666666666667"
## [1] "Reference:  Idle Prediction:  Dab Accuracy:  14.2857142857143"
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  80"
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  76.6666666666667"
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  63.3333333333333"
## [1] "Reference:  Idle Prediction:  Dab Accuracy:  48.3870967741936"
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  64.5161290322581"
## [1] "Reference:  Idle Prediction:  Dab Accuracy:  18.5185185185185"
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  85.1851851851852"
## [1] "Reference:  Dab Prediction:  Dab Accuracy:  78.125"
## [1] "Reference:  Dab Prediction:  Dab Accuracy:  65.625"
## [1] "Reference:  Dab Prediction:  Dab Accuracy:  68.75"
## [1] "Reference:  Dab Prediction:  Dab Accuracy:  71.875"
## [1] "Reference:  Dab Prediction:  Dab Accuracy:  68.75"
## [1] "Reference:  Dab Prediction:  Dab Accuracy:  62.5"
## [1] "Reference:  Dab Prediction:  Dab Accuracy:  68.75"
## [1] "Reference:  Dab Prediction:  Dab Accuracy:  62.5"
## [1] "Reference:  Dab Prediction:  Siu Accuracy:  43.75"
## [1] "Reference:  Dab Prediction:  Dab Accuracy:  57.6923076923077"
## [1] "Reference:  Running Prediction:  Running Accuracy:  53.968253968254"
## [1] "Reference:  Running Prediction:  Running Accuracy:  77.6119402985075"
## [1] "Reference:  Running Prediction:  Running Accuracy:  73.5294117647059"
## [1] "Reference:  Running Prediction:  Running Accuracy:  78.3132530120482"
## [1] "Reference:  Running Prediction:  Running Accuracy:  73.3333333333333"
## [1] "Reference:  Running Prediction:  Running Accuracy:  75"
## [1] "Reference:  Running Prediction:  Running Accuracy:  76.4705882352941"
## [1] "Reference:  Running Prediction:  Running Accuracy:  71.875"
## [1] "Reference:  Running Prediction:  Running Accuracy:  70.9677419354839"
## [1] "Reference:  Running Prediction:  Running Accuracy:  60.3448275862069"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  50"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  60"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  88.2352941176471"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  93.3333333333333"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  87.5"
## [1] "Reference:  Siu Prediction:  Running Accuracy:  0"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  80"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  84.6153846153846"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  56.25"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  100"
# Mean per-sample accuracy of the random forest across all 40 recordings.
average_accuracy <- total_accuracy / length(unique(motion_data_test$Sample))

print(paste("AVG Accuracy: ", average_accuracy))
## [1] "AVG Accuracy:  65.8927457209973"

Accuracy on train data with knn: 77.29 % without orientation

# k-nearest neighbours with the same 4-fold CV setup, for comparison
# against the random forest.
set.seed(6)
control_par <- trainControl(method = "cv", number = 4)
model_knn <- train(Category ~ .,
                   data = train_data_all,
                   method = "knn",
                   trControl = control_par,
                   metric = "Accuracy")

model_knn
## k-Nearest Neighbors 
## 
## 4355 samples
##    6 predictor
##    4 classes: 'Dab', 'Idle', 'Running', 'Siu' 
## 
## No pre-processing
## Resampling: Cross-Validated (4 fold) 
## Summary of sample sizes: 3267, 3267, 3265, 3266 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa    
##   5  0.8020649  0.7358098
##   7  0.7993080  0.7320936
##   9  0.8006850  0.7339331
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.

KNN with cross validation 4 fold

# Cross-validated (4-fold) confusion matrix of the kNN model,
# averaged over the training resamples.
cm_train_data <- confusionMatrix(model_knn)
cm_train_data
## Cross-Validated (4 fold) Confusion Matrix 
## 
## (entries are percentual average cell counts across resamples)
##  
##           Reference
## Prediction  Dab Idle Running  Siu
##    Dab     18.5  0.8     2.0  3.0
##    Idle     1.8 23.3     2.3  1.0
##    Running  1.0  1.4    20.3  2.5
##    Siu      1.5  0.8     1.7 18.0
##                             
##  Accuracy (average) : 0.8021

Accuracy on testing data with knn and cv: 82.07 % without orientation

set.seed(6)

# Predict the held-out 20 % split with the tuned kNN model.
knn_all_pred_test <- predict(model_knn, test_data_all)

# Overall accuracy in percent.
accuracy <- mean(knn_all_pred_test == test_data_all$Category) * 100
accuracy
## [1] 80.01842
cm_test_data <- confusionMatrix(knn_all_pred_test, test_data_all$Category)
cm_test_data
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Dab Idle Running Siu
##    Dab     200    7      22  22
##    Idle     20  255      29  11
##    Running  10   16     219  38
##    Siu      18    9      15 195
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8002          
##                  95% CI : (0.7751, 0.8236)
##     No Information Rate : 0.2643          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7331          
##                                           
##  Mcnemar's Test P-Value : 0.0003278       
## 
## Statistics by Class:
## 
##                      Class: Dab Class: Idle Class: Running Class: Siu
## Sensitivity              0.8065      0.8885         0.7684     0.7331
## Specificity              0.9391      0.9249         0.9201     0.9488
## Pos Pred Value           0.7968      0.8095         0.7739     0.8228
## Neg Pred Value           0.9425      0.9585         0.9178     0.9164
## Prevalence               0.2284      0.2643         0.2624     0.2449
## Detection Rate           0.1842      0.2348         0.2017     0.1796
## Detection Prevalence     0.2311      0.2901         0.2606     0.2182
## Balanced Accuracy        0.8728      0.9067         0.8443     0.8409
plt <- as.data.frame(cm_test_data$table)
# Reverse the prediction levels so the matrix diagonal runs top-left to
# bottom-right in the tile plot.
plt$Prediction <- factor(plt$Prediction, levels = rev(levels(plt$Prediction)))

# This is the kNN confusion matrix (the original reused the rf_conf_mat
# name). Tick labels are taken from the factor levels rather than
# hard-coded, so they stay correct if the class set changes.
knn_conf_mat <- ggplot(plt, aes(Prediction, Reference, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq)) +
  scale_fill_gradient(low = "white", high = "#009194") +
  labs(x = "Prediction", y = "Reference")

ggplotly(knn_conf_mat)

6. Now test the best model from dab on unknown data and compare accuracy

# NOTE(review): this repeats the preparation already done before the random
# forest evaluation. On this second run the column removal and the Sample
# recoding are effectively no-ops (the columns were already dropped and
# Sample is already the consecutive integers 1..40).
remove_col <- c("ID",  "Acceleration.Timestamp", "Author", "Orientation.X", "Orientation.Y", "Orientation.Z")
motion_data_test <- motion_data_test[,!names(motion_data_test) %in% remove_col]
motion_data_test$Sample <- as.numeric(as.factor(motion_data_test$Sample))

# All four activity classes should still be present.
unique(motion_data_test$Category)
## [1] Idle    Running Dab     Siu    
## Levels: Dab Idle Running Siu

Idle: 1 - 10, Dab: 11 - 20, Run: 21 - 30, Siu: 31 - 40

# Per-variable summary of the (already prepared) unseen-author test set.
inspect(motion_data_test)
## 
## categorical variables:  
##       name  class levels    n missing
## 1 Category factor      4 1420       0
##                                    distribution
## 1 Running (47.6%), Dab (22.1%) ...             
## 
## quantitative variables:  
##                name   class       min         Q1    median         Q3      max
## 1            Sample numeric   1.00000 13.0000000 22.000000 27.0000000 40.00000
## 2    Acceleration.X numeric -19.24533  5.6348475  8.989855 10.3458525 74.95678
## 3    Acceleration.Y numeric -62.43217 -2.5990100 -1.087490 -0.0446425 10.99254
## 4    Acceleration.Z numeric -27.55201 -1.3236225  1.985545  4.8890625 40.44529
## 5 AngularVelocity.X numeric  -7.90234 -0.5865550 -0.007295  0.5926225 12.85294
## 6 AngularVelocity.Y numeric  -7.73286 -0.2825200  0.029700  0.3547175 10.01106
## 7 AngularVelocity.Z numeric -12.65705 -0.2874675 -0.006055  0.2679275  7.92185
##          mean       sd    n missing
## 1 20.17323944 9.586790 1420       0
## 2  8.94612076 8.124970 1420       0
## 3 -2.02695663 5.929216 1420       0
## 4  1.79015740 5.362013 1420       0
## 5  0.03390892 1.698911 1420       0
## 6  0.05902677 1.120474 1420       0
## 7 -0.03541651 1.196526 1420       0

Dab is not recognized at all: 10/10 are misclassified (note: from an earlier run; the output below shows Dab is now mostly recognized)

Idle: 10 / 10 Samples with at least 70 % correct

Running: 10 / 10 Samples with at least 60 % correct

Siu: 9 / 10 Samples with at least 50 % correct

In total we have an avg accuracy of 60 %

# Evaluate the kNN model on each individual recording of the unseen author.
# Fixes over the original: the no-op column rename is gone and the majority
# predicted class comes directly from table() instead of a confusionMatrix
# of the predictions against themselves.
total_accuracy <- 0
average_accuracy <- 0
for (i in seq_len(length(unique(motion_data_test$Sample)))) {
  # One recorded sample (Sample ids were recoded to 1..n above).
  motion_data_unknown <- subset(motion_data_test, Sample == i)
  ref <- motion_data_unknown$Category
  motion_data_unknown <- motion_data_unknown[, names(motion_data_unknown) != "Sample"]

  # Copy with the labels blanked out so the model cannot see the answer.
  motion_data_no_labels <- data.frame(motion_data_unknown)
  motion_data_no_labels$Category <- ""

  set.seed(6)
  # Generate predictions for this sample.
  knn_pred_new <- predict(object = model_knn, newdata = motion_data_no_labels)

  # Per-sample accuracy against the true labels.
  accuracy <- mean(knn_pred_new == motion_data_unknown$Category) * 100
  total_accuracy <- total_accuracy + accuracy

  motion_data_no_labels$Category <- knn_pred_new

  # Majority-vote predicted class for this recording.
  majority_pred <- names(which.max(table(knn_pred_new)))

  print(paste("Reference: ", unique(ref),
              "Prediction: ", majority_pred,
              "Accuracy: ", accuracy, sep = " "))
}
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  66.6666666666667"
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  93.3333333333333"
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  57.1428571428571"
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  96"
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  86.6666666666667"
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  83.3333333333333"
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  90.3225806451613"
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  90.3225806451613"
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  74.0740740740741"
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  100"
## [1] "Reference:  Dab Prediction:  Dab Accuracy:  68.75"
## [1] "Reference:  Dab Prediction:  Dab Accuracy:  50"
## [1] "Reference:  Dab Prediction:  Dab Accuracy:  46.875"
## [1] "Reference:  Dab Prediction:  Dab Accuracy:  50"
## [1] "Reference:  Dab Prediction:  Dab Accuracy:  40.625"
## [1] "Reference:  Dab Prediction:  Dab Accuracy:  46.875"
## [1] "Reference:  Dab Prediction:  Dab Accuracy:  53.125"
## [1] "Reference:  Dab Prediction:  Dab Accuracy:  50"
## [1] "Reference:  Dab Prediction:  Siu Accuracy:  25"
## [1] "Reference:  Dab Prediction:  Dab Accuracy:  65.3846153846154"
## [1] "Reference:  Running Prediction:  Running Accuracy:  63.4920634920635"
## [1] "Reference:  Running Prediction:  Running Accuracy:  74.6268656716418"
## [1] "Reference:  Running Prediction:  Running Accuracy:  77.9411764705882"
## [1] "Reference:  Running Prediction:  Running Accuracy:  73.4939759036145"
## [1] "Reference:  Running Prediction:  Running Accuracy:  78.6666666666667"
## [1] "Reference:  Running Prediction:  Running Accuracy:  75"
## [1] "Reference:  Running Prediction:  Running Accuracy:  75"
## [1] "Reference:  Running Prediction:  Running Accuracy:  64.0625"
## [1] "Reference:  Running Prediction:  Running Accuracy:  62.9032258064516"
## [1] "Reference:  Running Prediction:  Running Accuracy:  53.448275862069"
## [1] "Reference:  Siu Prediction:  Running Accuracy:  40"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  45"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  70.5882352941177"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  53.3333333333333"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  68.75"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  100"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  60"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  76.9230769230769"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  43.75"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  66.6666666666667"
# Mean per-sample accuracy of the kNN model across all 40 recordings.
average_accuracy <- total_accuracy / length(unique(motion_data_test$Sample))

average_accuracy
## [1] 66.45357

Accuracy on train data with rpart: 55.69 % without orientation

# Single decision tree (CART) with the same 4-fold CV setup, as a
# simple baseline against the random forest and kNN.
set.seed(6)
control_par <- trainControl(method = "cv", number = 4)
model_rpart <- train(Category ~ .,
                     data = train_data_all,
                     method = "rpart",
                     trControl = control_par,
                     metric = "Accuracy")

model_rpart
## CART 
## 
## 4355 samples
##    6 predictor
##    4 classes: 'Dab', 'Idle', 'Running', 'Siu' 
## 
## No pre-processing
## Resampling: Cross-Validated (4 fold) 
## Summary of sample sizes: 3267, 3267, 3265, 3266 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.08143526  0.5161941  0.3482533
##   0.08642746  0.4877288  0.3098354
##   0.22745710  0.3048900  0.0572642
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.08143526.
# Basic base-graphics rendering of the fitted tree. Uses TRUE instead of
# the reassignable T literal; stray indentation removed.
plot(model_rpart$finalModel, branch = TRUE, margin = 0.1)
text(model_rpart$finalModel)

Rpart with cross validation 4 fold

# Cross-validated (4-fold) confusion matrix of the decision tree,
# averaged over the training resamples.
cm_train_data <- confusionMatrix(model_rpart)
cm_train_data
## Cross-Validated (4 fold) Confusion Matrix 
## 
## (entries are percentual average cell counts across resamples)
##  
##           Reference
## Prediction  Dab Idle Running  Siu
##    Dab      0.0  0.0     0.0  0.0
##    Idle    10.9 24.4    10.6  4.7
##    Running  3.1  0.8    10.1  2.7
##    Siu      8.8  1.1     5.6 17.1
##                             
##  Accuracy (average) : 0.5162

Accuracy on testing data with rpart and cv: 49.08 % without orientation

# Predict activity classes for the held-out test set and report
# the overall accuracy in percent.
set.seed(6)
rpart_all_pred_test <- predict(model_rpart, test_data_all)

correct <- rpart_all_pred_test == test_data_all$Category
accuracy <- 100 * mean(correct)
accuracy
## [1] 49.07919
# Full confusion matrix (counts plus per-class sensitivity/specificity etc.)
# of the predictions against the true test-set labels
cm_test_data <- confusionMatrix(rpart_all_pred_test, test_data_all$Category)
cm_test_data
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Dab Idle Running Siu
##    Dab       0    0       0   0
##    Idle    111  272     145  65
##    Running  19    2      71  11
##    Siu     118   13      69 190
## 
## Overall Statistics
##                                          
##                Accuracy : 0.4908         
##                  95% CI : (0.4607, 0.521)
##     No Information Rate : 0.2643         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.3145         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
## 
## Statistics by Class:
## 
##                      Class: Dab Class: Idle Class: Running Class: Siu
## Sensitivity              0.0000      0.9477        0.24912     0.7143
## Specificity              1.0000      0.5982        0.96005     0.7561
## Pos Pred Value              NaN      0.4587        0.68932     0.4872
## Neg Pred Value           0.7716      0.9696        0.78230     0.8908
## Prevalence               0.2284      0.2643        0.26243     0.2449
## Detection Rate           0.0000      0.2505        0.06538     0.1750
## Detection Prevalence     0.0000      0.5460        0.09484     0.3591
## Balanced Accuracy        0.5000      0.7730        0.60459     0.7352
# Interactive heatmap of the test-set confusion matrix.
# Axis labels are derived from the factor levels instead of being hard-coded,
# so the plot stays correct if the set of activity classes ever changes.
plt <- as.data.frame(cm_test_data$table)
# Reverse the prediction axis so the correct predictions lie on the
# top-left to bottom-right diagonal
plt$Prediction <- factor(plt$Prediction, levels = rev(levels(plt$Prediction)))

rf_conf_mat <- ggplot(plt, aes(Prediction, Reference, fill = Freq)) +
        geom_tile() +
        geom_text(aes(label = Freq)) +
        scale_fill_gradient(low = "white", high = "#009194") +
        labs(x = "Prediction", y = "Reference") +
        scale_y_discrete(labels = levels(plt$Reference)) +
        scale_x_discrete(labels = levels(plt$Prediction))


ggplotly(rf_conf_mat)

6. Now test the rpart model on unknown data and compare accuracy

# Drop identifier and orientation columns, which are not used as predictors,
# then encode the sample identifier as a numeric index.
remove_col <- c("ID",  "Acceleration.Timestamp", "Author", "Orientation.X", "Orientation.Y", "Orientation.Z")
keep_cols <- !(names(motion_data_test) %in% remove_col)
motion_data_test <- motion_data_test[, keep_cols]
motion_data_test$Sample <- as.numeric(as.factor(motion_data_test$Sample))

unique(motion_data_test$Category)
## [1] Idle    Running Dab     Siu    
## Levels: Dab Idle Running Siu

Sample index ranges per activity (see the loop output below): Idle: 1 - 10, Dab: 11 - 20, Running: 21 - 30, Siu: 31 - 40

inspect(motion_data_test)
## 
## categorical variables:  
##       name  class levels    n missing
## 1 Category factor      4 1420       0
##                                    distribution
## 1 Running (47.6%), Dab (22.1%) ...             
## 
## quantitative variables:  
##                name   class       min         Q1    median         Q3      max
## 1            Sample numeric   1.00000 13.0000000 22.000000 27.0000000 40.00000
## 2    Acceleration.X numeric -19.24533  5.6348475  8.989855 10.3458525 74.95678
## 3    Acceleration.Y numeric -62.43217 -2.5990100 -1.087490 -0.0446425 10.99254
## 4    Acceleration.Z numeric -27.55201 -1.3236225  1.985545  4.8890625 40.44529
## 5 AngularVelocity.X numeric  -7.90234 -0.5865550 -0.007295  0.5926225 12.85294
## 6 AngularVelocity.Y numeric  -7.73286 -0.2825200  0.029700  0.3547175 10.01106
## 7 AngularVelocity.Z numeric -12.65705 -0.2874675 -0.006055  0.2679275  7.92185
##          mean       sd    n missing
## 1 20.17323944 9.586790 1420       0
## 2  8.94612076 8.124970 1420       0
## 3 -2.02695663 5.929216 1420       0
## 4  1.79015740 5.362013 1420       0
## 5  0.03390892 1.698911 1420       0
## 6  0.05902677 1.120474 1420       0
## 7 -0.03541651 1.196526 1420       0

Dab is not recognized at all: 10/10 samples are misclassified

Idle: 10 / 10 Samples with at least 70 % correct

Running: 10 / 10 Samples with at least 60 % correct

Siu: 9 / 10 Samples with at least 50 % correct

In total we have an avg accuracy of 60 %

# Evaluate the trained rpart model on each unseen sample individually:
# predict the activity for every sample, compare against the true labels,
# and report the majority-vote prediction together with per-sample accuracy.
total_accuracy <- 0
average_accuracy <- 0
n_samples <- length(unique(motion_data_test$Sample))
for (i in seq_len(n_samples)) {
  # All rows belonging to the i-th sample (Sample was encoded 1..n above);
  # drop the Sample id column, which is not a predictor.
  motion_data_unknown <- subset(motion_data_test, Sample == i)
  ref <- unique(motion_data_unknown$Category)
  motion_data_unknown <- motion_data_unknown[, !names(motion_data_unknown) %in% c("Sample")]

  set.seed(6)
  ## Generate predictions. The Category column in newdata is ignored by
  ## predict() (it is the outcome, not a predictor), so there is no need
  ## to blank it out first as the original code did.
  rpart_pred_new <- predict(object = model_rpart, newdata = motion_data_unknown)

  ## Per-sample accuracy in percent
  accuracy <- mean(rpart_pred_new == motion_data_unknown$Category) * 100
  total_accuracy <- total_accuracy + accuracy

  ## Majority-vote predicted class for this sample. (The original obtained
  ## this by running confusionMatrix() with the predictions as BOTH
  ## arguments — comparing them against themselves — and taking the largest
  ## cell; table() expresses that intent directly.)
  majority_pred <- names(which.max(table(rpart_pred_new)))

  print(paste("Reference: ", ref, "Prediction: ", majority_pred, "Accuracy: ", accuracy, sep = " "))
}
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  75.7575757575758"
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  100"
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  71.4285714285714"
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  100"
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  96.6666666666667"
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  96.6666666666667"
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  90.3225806451613"
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  100"
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  100"
## [1] "Reference:  Idle Prediction:  Idle Accuracy:  100"
## [1] "Reference:  Dab Prediction:  Siu Accuracy:  0"
## [1] "Reference:  Dab Prediction:  Siu Accuracy:  0"
## [1] "Reference:  Dab Prediction:  Siu Accuracy:  0"
## [1] "Reference:  Dab Prediction:  Siu Accuracy:  0"
## [1] "Reference:  Dab Prediction:  Siu Accuracy:  0"
## [1] "Reference:  Dab Prediction:  Siu Accuracy:  0"
## [1] "Reference:  Dab Prediction:  Siu Accuracy:  0"
## [1] "Reference:  Dab Prediction:  Idle Accuracy:  0"
## [1] "Reference:  Dab Prediction:  Siu Accuracy:  0"
## [1] "Reference:  Dab Prediction:  Siu Accuracy:  0"
## [1] "Reference:  Running Prediction:  Siu Accuracy:  36.5079365079365"
## [1] "Reference:  Running Prediction:  Siu Accuracy:  19.4029850746269"
## [1] "Reference:  Running Prediction:  Siu Accuracy:  22.0588235294118"
## [1] "Reference:  Running Prediction:  Siu Accuracy:  26.5060240963855"
## [1] "Reference:  Running Prediction:  Siu Accuracy:  26.6666666666667"
## [1] "Reference:  Running Prediction:  Siu Accuracy:  26.4705882352941"
## [1] "Reference:  Running Prediction:  Siu Accuracy:  26.4705882352941"
## [1] "Reference:  Running Prediction:  Siu Accuracy:  32.8125"
## [1] "Reference:  Running Prediction:  Siu Accuracy:  30.6451612903226"
## [1] "Reference:  Running Prediction:  Siu Accuracy:  27.5862068965517"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  55"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  70"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  82.3529411764706"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  80"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  93.75"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  100"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  80"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  100"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  81.25"
## [1] "Reference:  Siu Prediction:  Siu Accuracy:  91.6666666666667"
## Average accuracy across the distinct unseen samples
## (sum(!duplicated(x)) counts the unique values of x)
average_accuracy <- total_accuracy / sum(!duplicated(motion_data_test$Sample))

average_accuracy
## [1] 50.99973

Good correlation between:

  • Orientation.X and Orientation.Y with 0.56

  • Orientation.Z and Orientation.Y with 0.48

  • Category and Orientation.Y with 0.43

  • Orientation.Z and Orientation.X with 0.54

Angular Velocity is quite independent compared to other variables, but we still included it, to determine the activities better.

Since everyone used another phone position, we removed the orientation, because it also didn’t really improve the model accuracy.

We also removed the magnetic field, because it contained a lot of NA's, and even when using only the rows without NA's, the models did not really improve with it: only about 40 % accuracy on unseen data.

In total we tried three different models: Random forest, KNN and Rpart (Decision Tree).

Accuracies:

  • Prediction with random forest: Train data: 81.56 %, Test data: 83.37 % Unseen data: 60.48 %

  • Prediction with knn: Train data: 77.29 %, Test data: 82.08 % Unseen data: 59.52 %

  • Prediction with rpart: Train data: 55.69 %, Test data: 52.07 % Unseen data: 48.50 %

Classification:

  • The problem is that the models do not recognize dab at all.

  • The reason for that is that the motion pattern of Dab largely overlaps with that of Siu.

  • That’s why the models also misclassified Dab as Siu.

So, we decided to use another activity called lunge and to train and test the models again.